clean and tidy the airbnb data

#plotly scatter plot

airbnb_data %>%
  mutate(text_label = str_c("Price: $", price, '\nRating: ', rating, '\nName: ', name)) %>% 
  plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
          alpha = 0.5, 
          color = ~price,
          text = ~text_label)
common_neighborhoods = airbnb_data %>% 
  count(neighbourhood, sort = TRUE) %>% 
  top_n(8) %>% 
  dplyr::select(neighbourhood)
## Selecting by n
#neighborhood-price
inner_join(airbnb_data, common_neighborhoods,
             by = "neighbourhood") %>% 
  mutate(neighbourhood = fct_reorder(neighbourhood, price)) %>% 
  plot_ly(y = ~price, color = ~neighbourhood, type = "box",
          colors = "Set2")
#?
airbnb_data %>% 
  count(neighbourhood) %>% 
  mutate(neighbourhood = fct_reorder(neighbourhood, n)) %>% 
  plot_ly(x = ~neighbourhood, y = ~n, color = ~neighbourhood, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#corplot
corr<-airbnb_data%>%
  dplyr::select(-boro , -neighbourhood, -room_type, -name, -latitude, -longitude)
corrplot(cor(corr), method="square",shade.col=NA, tl.col="black", tl.srt=45)

corrplot(cor(airbnb), method = "square", shade.col = NA)

numeric_data = airbnb_data%>%
  dplyr::select(price,everything(),-boro, -neighbourhood, -room_type, -name, -longitude, -latitude)

plot(numeric_data$price)

featurePlot(x=numeric_data[,2:5],y=numeric_data$price, plot = 'pairs')

#splitting data into train and test
sample_size=floor(0.75*nrow(numeric_data))

set.seed(1)

sample_air = sample(seq_len(nrow(numeric_data)), size = sample_size)
train = numeric_data[sample_air, ]
test = numeric_data[-sample_air, ]

#LSE on the traing data
ln_model = lm(price~., data=train)
pred_data = predict(ln_model, test)
test_error = mean((pred_data-test$price)^2)
test_error
## [1] 6779.265
#ridge regression
x_test = model.matrix(price~.,train)[,-1]
y_test = train$price
grid_ridge = 10^seq(10.,-5, length = 1000)

ridge_model = glmnet(x_test,y_test,alpha=0,lambda = grid_ridge)
cv.out = cv.glmnet(x_test,y_test,alpha=0, lambda = grid_ridge,
                   type.measure = "mse")
plot(cv.out)

best_lambda = cv.out$lambda.min
round(best_lambda,3)
## [1] 1.231
best_ridge_mod = glmnet(x_test,y_test,alpha=0,lambda = best_lambda)

reg_pred=predict(best_ridge_mod ,s=best_lambda,newx=x_test)

test_error2= 
mean((reg_pred-y_test)^2);test_error2
## [1] 6535.136
set.seed(2)

grid_lasso = exp(seq(1,-8,length=100))
lasso_mod = glmnet(x_test,y_test,alpha=1,lambda= grid_lasso)

cv.out2 = cv.glmnet(x_test,y_test,alpha=1,lambda= grid_lasso)
best_lambda2 = cv.out2$lambda_min

plot(cv.out2)

pred_lasso = predict(lasso_mod ,s=best_lambda2, newx=x_test)
test_error3 = mean((pred_lasso-y_test)^2);test_error3
## [1] 6537.716
coefficients = predict(lasso_mod, s=best_lambda2, type="coefficients") %>%
                as.matrix()
non_zero_coeff = coefficients[coefficients[,1] != 0,]
non_zero_coeff%>% knitr::kable()
s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s30 s31 s32 s33 s34 s35 s36 s37 s38 s39 s40 s41 s42 s43 s44 s45 s46 s47 s48 s49 s50 s51 s52 s53 s54 s55 s56 s57 s58 s59 s60 s61 s62 s63 s64 s65 s66 s67 s68 s69 s70 s71 s72 s73 s74 s75 s76 s77 s78 s79 s80 s81 s82 s83 s84 s85 s86 s87 s88 s89 s90 s91 s92 s93 s94 s95 s96 s97 s98 s99
(Intercept) -14.6337302 -17.5382337 -20.1904861 -22.5933756 -24.7553207 -26.7290819 -28.5313247 -30.1769539 -31.7221908 -33.1954397 -34.5444728 -35.7762205 -36.9009294 -37.927902 -38.8656313 -39.7218726 -40.5037072 -41.217601 -41.8694577 -42.4646687 -43.0081563 -43.4999213 -43.9532638 -44.3673820 -44.7455244 -45.0908072 -45.4060852 -45.6939658 -45.9568297 -46.1968510 -46.4104660 -46.6157557 -46.7942310 -46.9614130 -47.1141433 -47.2535970 -47.3809307 -47.4971988 -47.603363 -47.7003018 -47.7888166 -47.8696395 -47.9434390 -48.0108253 -48.0723558 -48.1285394 -48.179841 -48.2266839 -48.2694564 -48.3085121 -48.3441738 -48.3767366 -48.4064697 -48.4336190 -48.458409 -48.4810449 -48.5017136 -48.5205863 -48.5378190 -48.5535541 -48.5679219 -48.5810411 -48.5930203 -48.6039585 -48.6139462 -48.6230659 -48.6313932 -48.6389968 -48.6459396 -48.6522792 -48.6580678 -48.6633534 -48.6681797 -48.6725866 -48.6766105 -48.680285 -48.6836398 -48.6867032 -48.6895004 -48.6920546 -48.6943867 -48.6965163 -48.6984607 -48.7002362 -48.7018574 -48.7033377 -48.7046894 -48.7059237 -48.707051 -48.7080797 -48.7090193 -48.7098772 -48.7106606 -48.7113760 -48.7120291 -48.7126256 -48.7131701 -48.7136674 -48.7141214 -48.714536
rating 30.8261182 31.4158261 31.9543195 32.4511295 32.9139766 33.3365343 33.7223720 34.0746807 34.4031525 34.7120665 34.9951050 35.2535366 35.4895104 35.704978 35.9017222 36.0813691 36.2454049 36.395186 36.5319514 36.6568319 36.7708604 36.8740607 36.9691770 37.0560607 37.1353963 37.2078379 37.2739844 37.3343827 37.3895325 37.4398899 37.4848583 37.5277840 37.5653518 37.6004570 37.6324980 37.6617453 37.6884485 37.7128305 37.735093 37.7554218 37.7739837 37.7909325 37.8064084 37.8205395 37.8334426 37.8452245 37.855983 37.8658057 37.8747752 37.8829653 37.8904436 37.8972721 37.9035073 37.9092005 37.914399 37.9191459 37.9234802 37.9274378 37.9310516 37.9343513 37.9373642 37.9401154 37.9426274 37.9449212 37.9470157 37.9489281 37.9506743 37.9522688 37.9537248 37.9550542 37.9562681 37.9573765 37.9583886 37.9593127 37.9601566 37.960927 37.9616306 37.9622730 37.9628596 37.9633952 37.9638843 37.9643308 37.9647386 37.9651109 37.9654509 37.9657613 37.9660448 37.9663036 37.966540 37.9667557 37.9669528 37.9671327 37.9672970 37.9674470 37.9675839 37.9677090 37.9678232 37.9679275 37.9680227 37.968110
availability_365 0.0232891 0.0256337 0.0277746 0.0298086 0.0318048 0.0336275 0.0352917 0.0368114 0.0381007 0.0390997 0.0400153 0.0408514 0.0416149 0.042312 0.0429485 0.0435297 0.0440604 0.044545 0.0449875 0.0453915 0.0457605 0.0461016 0.0464091 0.0466896 0.0469457 0.0471796 0.0473931 0.0475881 0.0477661 0.0479287 0.0480838 0.0482132 0.0483421 0.0484564 0.0485594 0.0486532 0.0487387 0.0488168 0.048888 0.0489531 0.0490125 0.0490668 0.0491163 0.0491616 0.0492029 0.0492406 0.049275 0.0493065 0.0493352 0.0493614 0.0493854 0.0494072 0.0494272 0.0494454 0.049462 0.0494772 0.0494911 0.0495038 0.0495154 0.0495259 0.0495356 0.0495444 0.0495524 0.0495598 0.0495665 0.0495726 0.0495782 0.0495833 0.0495879 0.0495922 0.0495961 0.0495996 0.0496029 0.0496058 0.0496085 0.049611 0.0496132 0.0496153 0.0496172 0.0496189 0.0496205 0.0496219 0.0496232 0.0496244 0.0496255 0.0496265 0.0496274 0.0496282 0.049629 0.0496296 0.0496303 0.0496309 0.0496314 0.0496319 0.0496323 0.0496327 0.0496331 0.0496334 0.0496337 0.049634
calculated_host_listings_count -2.6397419 -2.7643308 -2.8780926 -2.9806611 -3.0720127 -3.1554270 -3.2315928 -3.3011397 -3.3642121 -3.4211202 -3.4730549 -3.5204772 -3.5637785 -3.603317 -3.6394195 -3.6723848 -3.7024855 -3.729970 -3.7550669 -3.7779825 -3.7989068 -3.8180578 -3.8355016 -3.8514277 -3.8659696 -3.8792479 -3.8913723 -3.9024431 -3.9125518 -3.9217821 -3.9302497 -3.9379103 -3.9449663 -3.9513840 -3.9572407 -3.9625878 -3.9674700 -3.9719280 -3.975999 -3.9797154 -3.9831092 -3.9862081 -3.9890378 -3.9916215 -3.9939807 -3.9961349 -3.998102 -3.9998979 -4.0015379 -4.0030354 -4.0044027 -4.0056513 -4.0067913 -4.0078323 -4.008783 -4.0096507 -4.0104431 -4.0111668 -4.0118275 -4.0124308 -4.0129817 -4.0134847 -4.0139440 -4.0143634 -4.0147464 -4.0150960 -4.0154153 -4.0157069 -4.0159731 -4.0162161 -4.0164381 -4.0166407 -4.0168258 -4.0169948 -4.0171490 -4.017290 -4.0174186 -4.0175360 -4.0176433 -4.0177412 -4.0178306 -4.0179123 -4.0179868 -4.0180549 -4.0181171 -4.0181738 -4.0182256 -4.0182730 -4.018316 -4.0183556 -4.0183917 -4.0184246 -4.0184546 -4.0184820 -4.0185071 -4.0185299 -4.0185508 -4.0185699 -4.0185873 -4.018603
set.seed(46)
train_price_tree = sample(1:nrow(airbnb), nrow(airbnb)/2)

# grow tree 
fit_price = rpart(price ~ ., data = airbnb, subset = train_price_tree)

printcp(fit_price) # display the results 
## 
## Regression tree:
## rpart(formula = price ~ ., data = airbnb, subset = train_price_tree)
## 
## Variables actually used in tree construction:
## [1] availability_365 id               neighbourhood    room_type       
## 
## Root node error: 721426951/20376 = 35406
## 
## n= 20376 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.101805      0   1.00000 1.00007 0.16775
## 2 0.021830      1   0.89819 0.89845 0.16824
## 3 0.010812      2   0.87636 0.89125 0.16662
## 4 0.010460      3   0.86555 0.90215 0.16701
## 5 0.010000      4   0.85509 0.90196 0.16704
plotcp(fit_price) # visualize cross-validation results 

summary(fit_price) # detailed summary of splits
## Call:
## rpart(formula = price ~ ., data = airbnb, subset = train_price_tree)
##   n= 20376 
## 
##           CP nsplit rel error    xerror      xstd
## 1 0.10180536      0 1.0000000 1.0000674 0.1677542
## 2 0.02183025      1 0.8981946 0.8984541 0.1682425
## 3 0.01081207      2 0.8763644 0.8912506 0.1666185
## 4 0.01045992      3 0.8655523 0.9021475 0.1670055
## 5 0.01000000      4 0.8550924 0.9019622 0.1670371
## 
## Variable importance
##                      room_type               availability_365 
##                             46                             10 
## calculated_host_listings_count                 minimum_nights 
##                              9                              8 
##                             id         review_scores_location 
##                              7                              6 
##              number_of_reviews                  neighbourhood 
##                              5                              5 
##            neighbourhood_group              reviews_per_month 
##                              3                              1 
## 
## Node number 1: 20376 observations,    complexity param=0.1018054
##   mean=145.2801, MSE=35405.72 
##   left son=2 (10398 obs) right son=3 (9978 obs)
##   Primary splits:
##       room_type                      < 1.5      to the right, improve=0.101805400, (0 missing)
##       neighbourhood_group            < 2.5      to the left,  improve=0.015634360, (0 missing)
##       neighbourhood                  < 29.5     to the left,  improve=0.009643181, (0 missing)
##       availability_365               < 364.5    to the left,  improve=0.008202421, (0 missing)
##       calculated_host_listings_count < 1.5      to the right, improve=0.005307350, (0 missing)
##   Surrogate splits:
##       calculated_host_listings_count < 1.5      to the right, agree=0.601, adj=0.186, (0 split)
##       minimum_nights                 < 1.5      to the left,  agree=0.591, adj=0.164, (0 split)
##       review_scores_location         < 9.5      to the left,  agree=0.560, adj=0.101, (0 split)
##       number_of_reviews              < 4.5      to the left,  agree=0.549, adj=0.078, (0 split)
##       neighbourhood_group            < 2.5      to the left,  agree=0.543, adj=0.067, (0 split)
## 
## Node number 2: 10398 observations
##   mean=86.46769, MSE=21407.37 
## 
## Node number 3: 9978 observations,    complexity param=0.02183025
##   mean=206.568, MSE=42632.59 
##   left son=6 (9563 obs) right son=7 (415 obs)
##   Primary splits:
##       availability_365               < 360.5    to the left,  improve=0.037022510, (0 missing)
##       neighbourhood_group            < 2.5      to the left,  improve=0.019142500, (0 missing)
##       neighbourhood                  < 29.5     to the left,  improve=0.010184230, (0 missing)
##       calculated_host_listings_count < 4.5      to the left,  improve=0.009664178, (0 missing)
##       reviews_per_month              < 0.035    to the right, improve=0.009038157, (0 missing)
##   Surrogate splits:
##       calculated_host_listings_count < 11.5     to the left,  agree=0.961, adj=0.058, (0 split)
## 
## Node number 6: 9563 observations
##   mean=198.2919, MSE=28469.96 
## 
## Node number 7: 415 observations,    complexity param=0.01081207
##   mean=397.2795, MSE=331038.2 
##   left son=14 (395 obs) right son=15 (20 obs)
##   Primary splits:
##       id                             < 16515870 to the left,  improve=0.05677735, (0 missing)
##       calculated_host_listings_count < 4.5      to the left,  improve=0.03346697, (0 missing)
##       minimum_nights                 < 1.5      to the right, improve=0.02794635, (0 missing)
##       neighbourhood_group            < 2.5      to the left,  improve=0.02410042, (0 missing)
##       reviews_per_month              < 0.135    to the right, improve=0.02355728, (0 missing)
## 
## Node number 14: 395 observations
##   mean=366.4304, MSE=192384.9 
## 
## Node number 15: 20 observations,    complexity param=0.01045992
##   mean=1006.55, MSE=2679435 
##   left son=30 (8 obs) right son=31 (12 obs)
##   Primary splits:
##       neighbourhood    < 93.5     to the left,  improve=0.14081460, (0 missing)
##       availability_365 < 363.5    to the left,  improve=0.04684115, (0 missing)
##       id               < 16875470 to the right, improve=0.03264132, (0 missing)
##       minimum_nights   < 2.5      to the left,  improve=0.01088263, (0 missing)
##   Surrogate splits:
##       id                     < 17254840 to the right, agree=0.80, adj=0.500, (0 split)
##       review_scores_location < 4        to the right, agree=0.70, adj=0.250, (0 split)
##       reviews_per_month      < 0.5      to the right, agree=0.70, adj=0.250, (0 split)
##       number_of_reviews      < 0.5      to the right, agree=0.70, adj=0.250, (0 split)
##       availability_365       < 362.5    to the left,  agree=0.65, adj=0.125, (0 split)
## 
## Node number 30: 8 observations
##   mean=254.25, MSE=24762.94 
## 
## Node number 31: 12 observations
##   mean=1508.083, MSE=3820377
plot(as.party(fit_price))

pred_fit_price = predict(fit_price, newdata = airbnb[-train_price_tree,])


# no pruning
ctree_price = ctree(price~., airbnb, subset = train_price_tree)

#summary(ctree.boston)
#plot(ctree_price) useless

pred_ctree_price = predict(ctree_price, newdata = airbnb[-train_price_tree,])


# tune over maximum depth, method = "rpart2" (plots Max Tree depth)
# tune over cp, method = "rpart" (plots Complexity Parameter)
rpartTune = train(airbnb[,-11], airbnb$price, 
                   method = "rpart2",
                   trControl = trainControl(method = "cv", number =10))
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: Setting row names on a tibble is deprecated.
plot(rpartTune)

# create additional plots 
# two plots on one page par(mfrow=c(1,2))
rsq.rpart(fit_price) # visualize cross-validation results   
## 
## Regression tree:
## rpart(formula = price ~ ., data = airbnb, subset = train_price_tree)
## 
## Variables actually used in tree construction:
## [1] availability_365 id               neighbourhood    room_type       
## 
## Root node error: 721426951/20376 = 35406
## 
## n= 20376 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.101805      0   1.00000 1.00007 0.16775
## 2 0.021830      1   0.89819 0.89845 0.16824
## 3 0.010812      2   0.87636 0.89125 0.16662
## 4 0.010460      3   0.86555 0.90215 0.16701
## 5 0.010000      4   0.85509 0.90196 0.16704

# plot tree 
plot(fit_price, uniform=TRUE, main="Regression Tree for Price ")
text(fit_price, use.n=TRUE, all=TRUE, cex=.8)

# prune the tree 
pfit_price = prune(fit_price, cp=0.010460) # from cptable   

# plot the pruned tree 
plot(pfit_price, uniform=TRUE, main="Pruned Regression Tree for Price")
text(pfit_price, use.n=TRUE, all=TRUE, cex=.8)

set.seed(4)

train_lq = sample(1:dim(airbnb)[1], 15000, replace = FALSE)

ctrl = trainControl(method = "cv",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)

lda.fit = lda(neighbourhood_group ~ ., data = airbnb, subset = train_lq)

lda.pred = predict(lda.fit, newdata = airbnb[-train_lq,])

#roc.lda = roc(airbnb$neighbourhood_group[-train_lq], lda.pred$posterior[,2])
#plot(roc.lda, legacy.axes = TRUE)


#ldafit1 = train(x = airbnb[,-1],
#                 y = airbnb$neighbourhood_group,
#                 method = "lda",
#                 preProc = c("center","scale"),
#                 metric = "ROC",
#                 trControl = ctrl)


#qda.fit = qda(price ~ Lag1+Lag2, data = airbnb ,subset = train_ql)
airbnb_tree_data = dplyr::select(airbnb_data, -neighbourhood, -reviews_per_month,- name, -latitude, -longitude)

set.seed(123)
n = nrow(airbnb_tree_data)
trainIndex = sample(1:n, size = round(0.5*n), replace=FALSE)
airbnb_tree_train = airbnb_tree_data[trainIndex ,]
airbnb_tree_test = airbnb_tree_data[-trainIndex ,]


#pruned tree
tree.airbnb <- tree(boro ~ ., data = airbnb_tree_train)
summary(tree.airbnb)
## 
## Classification tree:
## tree(formula = boro ~ ., data = airbnb_tree_train)
## Variables actually used in tree construction:
## [1] "price"
## Number of terminal nodes:  3 
## Residual mean deviance:  1.956 = 29370 / 15020 
## Misclassification error rate: 0.4504 = 6765 / 15020
cv.tree.airbnb <- cv.tree(tree.airbnb, FUN = prune.misclass)
minsize=cv.tree.airbnb$size[which.min(cv.tree.airbnb$dev)]

prune.tree.airbnb <- prune.misclass(tree.airbnb, best = minsize)
summary(prune.tree.airbnb)
## 
## Classification tree:
## tree(formula = boro ~ ., data = airbnb_tree_train)
## Variables actually used in tree construction:
## [1] "price"
## Number of terminal nodes:  3 
## Residual mean deviance:  1.956 = 29370 / 15020 
## Misclassification error rate: 0.4504 = 6765 / 15020
#pruned tree plot
plot(prune.tree.airbnb)
text(prune.tree.airbnb, pretty = 0)

#prediction
predict.pruned.tree <- predict(prune.tree.airbnb, airbnb_tree_test, type='class')
table(predict.pruned.tree, airbnb_tree_test$boro)
##                    
## predict.pruned.tree Bronx Brooklyn Manhattan Queens Staten Island
##       Bronx             0        0         0      0             0
##       Brooklyn        160     2872      1649    869            60
##       Manhattan        64     3354      5415    545            32
##       Queens            0        0         0      0             0
##       Staten Island     0        0         0      0             0
basic.mse=mean(predict.pruned.tree != airbnb_tree_test$boro)

the optimal tree size equals to 3, the training data error rate is 0.4531. Use test dataset to predict, the error rate is 0.4448735.

bag.airbnb <- randomForest(boro ~ ., data = airbnb_tree_train, mtry = 10, ntree = 500, importance = TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within
## valid range
pred.bag.airbnb <- predict(bag.airbnb, newdata = airbnb_tree_test)
table(pred.bag.airbnb, airbnb_tree_test$boro )
##                
## pred.bag.airbnb Bronx Brooklyn Manhattan Queens Staten Island
##   Bronx             7       10         8     12             0
##   Brooklyn        150     3434      2256    770            56
##   Manhattan        40     2516      4662    419            22
##   Queens           27      259       135    211            13
##   Staten Island     0        7         3      2             1
bag.mse=mean(pred.bag.airbnb!= airbnb_tree_test$boro )
#0.4466045 test error.
varImpPlot(bag.airbnb)

#price and availability is important
rf.airbnb <- randomForest(boro ~ ., data = airbnb_tree_train, mtry = 5, ntree = 500, importance = TRUE)
pred.rf.airbnb <- predict(rf.airbnb, newdata = airbnb_tree_test)
table(pred.rf.airbnb, airbnb_tree_test$boro )
##                
## pred.rf.airbnb  Bronx Brooklyn Manhattan Queens Staten Island
##   Bronx             5        8        10     11             0
##   Brooklyn        154     3441      2217    784            62
##   Manhattan        40     2530      4709    403            20
##   Queens           25      239       124    214            10
##   Staten Island     0        8         4      2             0
rf.mse=mean(pred.rf.airbnb!= airbnb_tree_test$boro )
#0.4442743 test error
varImpPlot(rf.airbnb)

##price and availability is important
airbnb_tree_train$boro <- as.numeric(airbnb_tree_train$boro == "Manhattan") 
airbnb_tree_test$boro <- as.numeric(airbnb_tree_test$boro == "Manhattan") 
boost.airbnb = gbm(boro ~ ., data = airbnb_tree_train, distribution = "bernoulli", n.trees = 5000, interaction.depth = 4)
yhat.boost = predict(boost.airbnb, newdata = airbnb_tree_test,
n.trees = 5000, type = "response")
pred.boost.airbnb <- ifelse(yhat.boost > 0.5, 1, 0)
table(pred.boost.airbnb, airbnb_tree_test$boro)
##                  
## pred.boost.airbnb    0    1
##                 0 5745 2517
##                 1 2211 4547
boost.mse = (2187 + 2596)/(2187 + 4483 + 2596 + 5754)
#(2187 + 2596)/(2187 + 4483 + 2596 + 5754) =  0.318442   test error
summary(boost.airbnb)

##                                                           var   rel.inf
## price                                                   price 63.492283
## room_type                                           room_type 14.741522
## rating                                                 rating  8.596436
## availability_365                             availability_365  7.383736
## calculated_host_listings_count calculated_host_listings_count  3.586075
## number_of_reviews                           number_of_reviews  2.199948
#price is important
compare_df = data.frame(Boosting_MSE = boost.mse, Random_forest_MSE =rf.mse, Bagging_MSE = bag.mse, Decision_trees_MSE = basic.mse)
compare_df
##   Boosting_MSE Random_forest_MSE Bagging_MSE Decision_trees_MSE
## 1    0.3184421         0.4428096   0.4464048           0.448269